# ------------------------------------------
#
#
# MUST OPEN WITH ENCODING SHIFT-JIS (as there is Japanese chars line 1788)
#
#
# ------------------------------------------







# ----------------------------------------
#
# Replication files
# Catalinac, Amy 
# Dominance Through Division: Group-based Clientelism in Japan
#-----------------------------------------









#-----------------------------------------
# CHAPTER 6
#
# #-----------------------------------------








#-----------------------------------------
# PREPARATION

setwd("C:/Users/Amy Catalinac/Dropbox/TARGETING/replication2018/ANALYSIS OF MASTER DATA")
options(max.print=1000000)

dat <- readRDS("Master_plus_Snow_Turn_Trans_Dis6.rds")
elec.dat <- dat[!is.na(dat$hor_electoral_district),]

library(dplyr)
library(readstata13)
library(stargazer)
library(lmtest)
library(plm)
library(clubSandwich)
library(multiwayvcov)
library(car)
library(ggplot2)
library(car)










# ------------------------------------------------
# TABLE A.1
#
# descriptive statistics for Chapter 6 

pre.red <- subset(elec.dat, is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)

# Make a variable for how much support 
# municipality gives candidate who places first in the district:
pre.red$votes_first <- pre.red$cand_01_votes/pre.red$mun_voting_pop

# vars we want in table:
df <- pre.red[, c("F1logngaid_pc", "logngaid_pc",
                  "bestLDP_VshareVP", "rbestLDP_VshareVP", "sumLDP_VshareVP", "rsumLDP_VshareVP",
                  "HighLDPVS", 
                  "bestLDPp_VshareVP", "bestSeniorLDP_VshareVP", "bestnonLDP_VshareVP", "bestloseLDP_VshareVP",
                  "votes_first",
                  "mun_ceif", "needy_pc", "primary_pc", "lnpop", "logincome_pc", "mun_population_density")]
stargazer(df)


















# ------------------------------------------------
# This function conducts regressions on panel data, 
# allowing different FE to be specified and different clustering of SEs,
# and adjusts clustering so that it clusters the way Stata does.
#
# For consistency with results in "A Tournament Theory of Pork Barrel Politics:
# The Case of Japan", Comparative Political Studies, 2020.
#
# ------------------------------------------------

stata_plm <- function(formula, data,
                      model = "pooling", effect = "individual",
                      cluster){ # NB: cluster variable has to be a character string specifying the name of the variable we want to
  # cluster on.  It also must be in either of the indices of pdata.frame.
  
  # Plus, we will build in sanity checks: first, is the "cluster" argument a character string?
  if(class(cluster) != "character"){
    cat("Error: argument 'cluster' must be a character string that specifies the name of the cluster variable.")
    
    # Second, is "cluster" argument in either one of the indices of the pdata.frame?
  } else if(!(cluster %in% names(index(data)))){
    cat("Error: clustering variable must be either of indices of the pdata.frame")
  } else{
    
    # Use the cluster argument to specify the "cluster" argument of vcovHC ( which must be either of "group" and "time"):
    if(which(cluster == names(index(data))) == 1){
      cluster_index <- "group"
    } else if (which(cluster == names(index(data))) == 2){
      cluster_index <- "time"
    } else{
      # Finally, just to make sure the clustering variable is in the indices:
      cat("Error: clustering variable not found in the indices.")
    }
    
    # Estimate the regression with plm:
    reg <- plm(formula, data, model = model, effect = effect)
    # Adjust the clustered SEs the way Stata does them:
    len_clst <- length(unique(data[,cluster]))
    vcov <- vcovHC(reg, type = "HC1", method = "arellano",
                   cluster = cluster_index) * (len_clst/(len_clst-1))
    reg$vcov <- vcov
    return(reg)
  }
}









# ------------------------------------------------
# TABLE 6.1 
# (within_dist_T1)

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

pre.red <- subset(elec.dat, elec.dat$year != 2009 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

# ---------------------------
# best

mod1 <- stata_plm(paste("F1logngaid_pc ~ bestLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod1)

# ---------------------------
# rbest

mod2 <- stata_plm(paste("F1logngaid_pc ~ rbestLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod2)

# ---------------------------
# sum

mod5 <- stata_plm(paste("F1logngaid_pc ~ sumLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod5)

# ---------------------------
# rsum

mod6 <- stata_plm(paste("F1logngaid_pc ~ rsumLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod6)

# --------------------------- 
# Print table:

stargazer(mod1, mod2, mod5, mod6,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))







# -----------------------------
# substantive effect for mod1

newdat <- mod1$model

# Sample mean of DV:
mean(na.omit(newdat$F1logngaid_pc))

# Exponentiate this:
exp(mean(na.omit(newdat$F1logngaid_pc)))

# Multiply by units for raw DV (hyaku man en)
exp(mean(na.omit(newdat$F1logngaid_pc)))*1000000
# 29705.37
sample_mean <- exp(mean(na.omit(newdat$F1logngaid_pc)))*1000000
sample_mean
# 29705.37

# Effect of a 1-SD increase in bestLDP_VshareVP
mod1$coefficients[1]*sd(newdat$bestLDP_VshareVP)
# 0.02573035 

# Exponentiate this as DV is logged:
exp(mod1$coefficients[1]*sd(newdat$bestLDP_VshareVP))
# 1.026064

# Express as percentage (as log DV):
(exp(mod1$coefficients[1]*sd(newdat$bestLDP_VshareVP))-1)*100
# 2.611978

# Increase over sample mean:
(((exp(mod1$coefficients[1]*sd(newdat$bestLDP_VshareVP))-1)*100)/100)*sample_mean
# 775.8977 
increase_over_sm <- (((exp(mod1$coefficients[1]*sd(newdat$bestLDP_VshareVP))-1)*100)/100)*sample_mean

# Mean municipality population:
exp(mean(newdat$lnpop))
# 13483

# How much extra money muni gets 
exp(mean(newdat$lnpop))*increase_over_sm












# ------------------------------------------------
# TABLE 6.2 
# (within_dist_T2)
#

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

pre.red <- subset(elec.dat, elec.dat$year <= 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

# -----------------------------
# best

mod1 <- stata_plm(paste("F1logngaid_pc ~ bestLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "twoways",
                  cluster = "muncode_num")
summary(mod1)

# ---------------------------
# rbest

mod2 <- stata_plm(paste("F1logngaid_pc ~ rbestLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "twoways",
                  cluster = "muncode_num")
summary(mod2)

# ---------------------------
mod5 <- stata_plm(paste("F1logngaid_pc ~ sumLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "twoways",
                  cluster = "muncode_num")
summary(mod5)

# ---------------------------
# rsum

mod6 <- stata_plm(paste("F1logngaid_pc ~ rsumLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "twoways",
                  cluster = "muncode_num")
summary(mod6)

# --------------------------- 
# Print table:

stargazer(mod1, mod2, mod5, mod6,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))









# ------------------------------------------------
# Calculating within-municipality shifts in electoral support and rankings 

# For each muncode_num how many times is it in data?
unique_munis <- unique(pre.red$muncode_num)
length(unique_munis)
counts <- c()

for(i in 1:length(unique_munis)){
  d <- subset(pre.red, pre.red$muncode_num==unique_munis[i])
  count <- nrow(d)
  counts <- c(counts, count)
}
dta <- data.frame(unique_munis, counts)
summary(dta$counts)
sum(dta$counts==5)
# 3149

# How much within-municipality movement in key variables?
check <- tapply(pre.red.pdf$bestLDP_VshareVP, pre.red.pdf$muncode_num, sd)
check <- as.data.frame(check)
dim(check)
summary(check[,1])
# mean: 0.06329

check <- tapply(pre.red.pdf$sumLDP_VshareVP, pre.red.pdf$muncode_num, sd)
check <- as.data.frame(check)
summary(check[,1])
# mean: 0.11

check <- tapply(pre.red.pdf$rsumLDP_VshareVP, pre.red.pdf$muncode_num, sd)
check <- as.data.frame(check)
summary(check[,1])
# mean: 0.167

# We can use our variable capturing absolute rankings:
check <- tapply(pre.red.pdf$abs.rsumLDP_VshareVP, pre.red.pdf$muncode_num, sd)
check <- as.data.frame(check)
summary(check[,1])
# mean: 5.8
boxplot(check[,1])











# ------------------------------------------------
# TABLE A.2
#
# (within_dist_high_sum)
#

pre.red <- subset(elec.dat, elec.dat$year <= 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

controlsH <- "HighLDPVS + mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

mod9 <- stata_plm(paste("F1logngaid_pc ~ bestLDP_VshareVP +
                      logngaid_pc + ",
                        controlsH), pre.red.pdf,
                  model = "within", effect = "twoways",
                  cluster = "muncode_num")

summary(mod9)

mod10 <- stata_plm(paste("F1logngaid_pc ~ rbestLDP_VshareVP +
                      logngaid_pc + ",
                         controlsH), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")
summary(mod10)

mod11 <- stata_plm(paste("F1logngaid_pc ~ sumLDP_VshareVP +
                      logngaid_pc + ",
                         controlsH), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")
summary(mod11)

mod12 <- stata_plm(paste("F1logngaid_pc ~ rsumLDP_VshareVP +
                      logngaid_pc + ",
                         controlsH), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")

summary(mod12)

stargazer(mod9, mod10, mod11, mod12, 
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))










# ------------------------------------------------
# Table 6.4 
#
# (within_dist_T4_best)
#

pre.red <- subset(elec.dat, elec.dat$year <= 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

mod15 <- stata_plm(paste("F1logngaid_pc ~ bestLDP_VshareVP +
                      bestSeniorLDP_VshareVP + logngaid_pc + ",
                         controls), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")

summary(mod15)

mod13 <- stata_plm(paste("F1logngaid_pc ~ bestLDPp_VshareVP +
                      logngaid_pc + ",
                         controls), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")

summary(mod13)

mod16 <- stata_plm(paste("F1logngaid_pc ~ bestnonLDP_VshareVP +
                      logngaid_pc + ",
                         controls), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")

summary(mod16)

mod17 <- stata_plm(paste("F1logngaid_pc ~ bestloseLDP_VshareVP +
                      logngaid_pc + ",
                         controls), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")

summary(mod17)


# -----------------------------------------------------
# Print table:

stargazer(mod15, mod13, mod16, mod17,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))









# ------------------------------------------------
# Table A.3 
#
# (within_dist_T4_sum)
#

pre.red <- subset(elec.dat, elec.dat$year <= 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

mod15 <- stata_plm(paste("F1logngaid_pc ~ sumLDP_VshareVP +
                      sumSeniorLDP_VshareVP + logngaid_pc + ",
                         controls), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")

summary(mod15)

mod13 <- stata_plm(paste("F1logngaid_pc ~ sumLDPp_VshareVP +
                      logngaid_pc + ",
                         controls), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")

summary(mod13)

mod16 <- stata_plm(paste("F1logngaid_pc ~ sumnonLDP_VshareVP +
                      logngaid_pc + ",
                         controls), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")

summary(mod16)

mod17 <- stata_plm(paste("F1logngaid_pc ~ sumloseLDP_VshareVP +
                      logngaid_pc + ",
                         controls), pre.red.pdf,
                   model = "within", effect = "twoways",
                   cluster = "muncode_num")

summary(mod17)


# -----------------------------------------------------
# Print table:

stargazer(mod15, mod13, mod16, mod17,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))










# ------------------------------------------------
# TABLE 6.6 
#
# (within_dist_post)

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

pre.red0 <- subset(elec.dat, elec.dat$year > 1993 & is.na(elec.dat$muncode_num) == F  & elec.dat$tmt_possible==1)
pre.red <- subset(pre.red0, pre.red0$year!="2009")

# How many unique district years?
length(unique(pre.red$district_year))

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

# ---------------------------
mod5 <- stata_plm(paste("F1logngaid_pc ~ sumLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod5)

# ---------------------------
# rsum

mod6 <- stata_plm(paste("F1logngaid_pc ~ rsumLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod6)

# --------------------------- 
# Print table:

stargazer(mod5, mod6,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))





# ------------------------------------------------
# Calculating within-municipality shifts in electoral support and rankings:

unique_munis <- unique(pre.red$muncode_num)
length(unique_munis)
# 3606

counts <- c()

for(i in 1:length(unique_munis)){
  d <- subset(pre.red, pre.red$muncode_num==unique_munis[i])
  count <- nrow(d)
  counts <- c(counts, count)
}

dta <- data.frame(unique_munis, counts)
summary(dta$counts)
sum(dta$counts==1)
# 103
sum(dta$counts==2)
# 321
sum(dta$counts==3)
# 2068
sum(dta$counts==4)
# 21
sum(dta$counts==5)
# 74
sum(dta$counts==6)
# 824

# How much movement in key variable?
check <- tapply(pre.red.pdf$sumLDP_VshareVP, pre.red.pdf$muncode_num, sd)
check <- as.data.frame(check)
summary(check[,1])
# mean: 0.1
# NA means there is no variation

check <- tapply(pre.red.pdf$rsumLDP_VshareVP, pre.red.pdf$muncode_num, sd)
check <- as.data.frame(check)
summary(check[,1])
# mean: 0.0986

check <- tapply(pre.red.pdf$abs.rsumLDP_VshareVP, pre.red.pdf$muncode_num, sd)
check <- as.data.frame(check)
summary(check[,1])
# 2.12











# ------------------------------------------------
# TABLE 6.7 
#
# (within_dist_postapp)
#

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

# Limit to municipalities in post-1996 period in t-possible districts without LDP winners:
pre.red0 <- subset(elec.dat, elec.dat$year >= 1996 & is.na(elec.dat$muncode_num) == F
                   & elec.dat$cand_01_pty!="LDP" & elec.dat$tmt_possible==1)
pre.red <- subset(pre.red0, pre.red0$year!="2009")

# How many unique district years?
length(unique(pre.red$district_year))
# 341

# How many of these districts were won by LDPI candidates?
check <- pre.red[, c("district_year", "cand_01_pty")]
nrow(check)
check2 <- check[!duplicated(check),]
nrow(check2)
table(check2$cand_01_pty) 
# 41

# How many of these districts were won by DPJ candidates?
table(check2$cand_01_pty)
# 172

# Examine the effect of municipality-level support for LDP candidate who lost:
pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

moda <- stata_plm(paste("F1logngaid_pc ~ sumloseLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(moda)

# Examine the effect of municipality-level support for LDP candidate who lost
#but this LDP candidate entered via PR:

pre.red0 <- subset(elec.dat, elec.dat$year >= 1996 & is.na(elec.dat$muncode_num) == F
                   & elec.dat$cand_01_pty!="LDP" & elec.dat$resurrectSSD==1  & elec.dat$tmt_possible==1)
pre.red <- subset(pre.red0, pre.red0$year!="2009")

length(unique(pre.red$district_year))
# 119 districts

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

modb <- stata_plm(paste("F1logngaid_pc ~ sumloseLDP_VshareVP +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(modb)

# print table

stargazer(moda , modb,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))












# ------------------------------------------------
# TABLE 6.8
#
# (within_dist_postapp1)

# ---------------------------------------------
# Model 1

pre.red0 <- subset(elec.dat, elec.dat$year >= 1996 & is.na(elec.dat$muncode_num) == F
                   & elec.dat$cand_01_pty=="LDPI" & elec.dat$tmt_possible==1)

pre.red <- subset(pre.red0, pre.red0$year!="2009")

length(unique(pre.red$district_year))

# Make a variable for how much support the municipality gives
# the candidate who places first in the district:

pre.red$votes_first <- pre.red$cand_01_votes/pre.red$mun_voting_pop

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

mod1 <- stata_plm(paste("F1logngaid_pc ~ votes_first +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod1)





# ---------------------------------------------
# Model 2

pre.red0 <- subset(elec.dat, elec.dat$year >= 1996 & is.na(elec.dat$muncode_num) == F
                   & elec.dat$cand_01_pty=="NFP" & elec.dat$tmt_possible==1)

pre.red <- subset(pre.red0, pre.red0$year!="2009")

length(unique(pre.red$district_year))
# 60

# Make a variable for how much support the municipality gives
# the candidate who places first in the district:
pre.red$votes_first <- pre.red$cand_01_votes/pre.red$mun_voting_pop

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

mod2 <- stata_plm(paste("F1logngaid_pc ~ votes_first +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod2)


# ---------------------------------------------
# Model 3

pre.red0 <- subset(elec.dat, elec.dat$year >= 1996 & is.na(elec.dat$muncode_num) == F
                   & elec.dat$cand_01_pty=="DPJ" & elec.dat$tmt_possible==1)

pre.red <- subset(pre.red0, pre.red0$year!="2009")

length(unique(pre.red$district_year))
# 172

# Make a var for support for first-placed candidate:
pre.red$votes_first <- pre.red$cand_01_votes/pre.red$mun_voting_pop

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

mod3 <- stata_plm(paste("F1logngaid_pc ~ votes_first +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod3)

# ---------------------------------------------
# Model 4

pre.red0 <- subset(elec.dat, elec.dat$year >= 1996 & is.na(elec.dat$muncode_num) == F
                   & elec.dat$cand_01_pty=="DPJ"  & elec.dat$tmt_possible==1)

pre.red <- subset(pre.red0, pre.red0$year!="2009" & pre.red0$resurrectSSD==0)

length(unique(pre.red$district_year)) 
# 104

# Make a var for support for first-placed candidate:
pre.red$votes_first <- pre.red$cand_01_votes/pre.red$mun_voting_pop

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

mod4 <- stata_plm(paste("F1logngaid_pc ~ votes_first +
                      logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod4)

# Print table

stargazer(mod1, mod2, mod3, mod4,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))







# ------------------------------------------------
# TABLE 6.3
#
# (within_dist_T3)
#

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

pre.red <- subset(elec.dat, elec.dat$year <= 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)

# Make first-differences dataset

pre.red <- pre.red %>% mutate(cat_year = as.numeric(as.factor(year)))

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "cat_year"))

pre.red.pdf2 <- pre.red.pdf[,c("F1logngaid_pc", 
                               "bestLDP_VshareVP", "rbestLDP_VshareVP", "sumLDP_VshareVP", "rsumLDP_VshareVP", 
                               "mun_ceif", "needy_pc", "primary_pc", "lnpop", "logincome_pc", "mun_population_density", 
                               "HI", "DISmal", "numLDPwin", "numLDPcand", 
                               "muncode_num", "cat_year", "district_year")]
ncol(pre.red.pdf2)

# Take first differences for variables in columns 1:15,
# keep columns 16, 17, 18 intact

diff_pdat <- pre.red.pdf2
for (c in 1:15) {
  diff_pdat[,c] <- diff(diff_pdat[,c])
}


# ------------------------------------
# Make differenced dataset into a pdata.frame with correct index.

diff_pdat2 <- as.data.frame(diff_pdat)
diff_pdat3 <- pdata.frame(diff_pdat2, index = c("muncode_num", "district_year"))

# ------------------------------------------------------
# Regressions

diff1a <- stata_plm(paste("F1logngaid_pc ~ bestLDP_VshareVP +",
                          controls), data = diff_pdat3,
                    model = "within", effect = "twoways",
                    cluster = "muncode_num")
summary(diff1a)

diff2a <- stata_plm(paste("F1logngaid_pc ~ rbestLDP_VshareVP +",
                          controls), data = diff_pdat3,
                    model = "within", effect = "twoways",
                    cluster = "muncode_num")
summary(diff2a)

diff3a <- stata_plm(paste("F1logngaid_pc ~ sumLDP_VshareVP +",
                          controls), data = diff_pdat3,
                    model = "within", effect = "twoways",
                    cluster = "muncode_num")
summary(diff3a)

diff4a <- stata_plm(paste("F1logngaid_pc ~ rsumLDP_VshareVP +",
                          controls), data = diff_pdat3,
                    model = "within", effect = "time",
                    cluster = "muncode_num")
summary(diff4a)

stargazer(diff1a, diff2a, diff3a, diff4a,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))










# ------------------------------------------------
# TABLE 6.5
#
# (within_dist_T5)
#

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

# Add district-level controls, including district_year as we use lm() below:
controlsd <- paste(controls, " + HI + DISmal + numLDPwin + numLDPcand + district_year")

pre.red <- subset(elec.dat, elec.dat$year <= 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)

# make first-differenced data set:
pre.red <- pre.red %>% mutate(cat_year = as.numeric(as.factor(year)))

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "cat_year"))

pre.red.pdf2 <- pre.red.pdf[,c("F1logngaid_pc", 
                               "bestLDP_VshareVP", "rbestLDP_VshareVP", "sumLDP_VshareVP", "rsumLDP_VshareVP", 
                               "mun_ceif", "needy_pc", "primary_pc", "lnpop", "logincome_pc", "mun_population_density", 
                               "HI", "DISmal", "numLDPwin", "numLDPcand", 
                               "muncode_num", "cat_year", "district_year")]


# Take first differences for variables in columns 1:15,
# keep columns 16, 17, 18 intact

diff_pdat <- pre.red.pdf2
for (c in 1:15) {
  diff_pdat[,c] <- diff(diff_pdat[,c])
}

# Add lagged and current variables to the first-differenced data set (diff_pdat)

diff_pdat <- diff_pdat %>% mutate(
  
  # Add a few lagged variables onto diff_pdat:  
  lag.year = NA,
  lag.abs.rsumLDP_VshareVP = NA,
  lag.district_year = NA,
  lag.F1logngaid_pc = NA,
  
  # Add non-first differenced variables from pre.red.pdf.
  # These pertain to current election year:
  curr.year = pre.red.pdf$year,
  curr.F1logngaid_pc = pre.red.pdf$F1logngaid_pc,
  curr.logngaid_pc = pre.red.pdf$logngaid_pc,
  curr.bestLDP_VshareVP = pre.red.pdf$bestLDP_VshareVP,
  curr.rbestLDP_VshareVP = pre.red.pdf$rbestLDP_VshareVP,
  curr.sumLDP_VshareVP = pre.red.pdf$sumLDP_VshareVP,
  curr.rsumLDP_VshareVP = pre.red.pdf$rsumLDP_VshareVP,
  curr.abs.rsumLDP_VshareVP = pre.red.pdf$abs.rsumLDP_VshareVP,
  curr.mun_ceif = pre.red.pdf$mun_ceif,
  curr.needy_pc = pre.red.pdf$needy_pc,
  curr.primary_pc = pre.red.pdf$primary_pc,
  curr.lnpop = pre.red.pdf$lnpop,
  curr.logincome_pc = pre.red.pdf$logincome_pc,
  curr.mun_population_density = pre.red.pdf$mun_population_density,
  
)

# add lagged variables from pre.red.pdf here
for (var in c("lag.year", "lag.abs.rsumLDP_VshareVP", "lag.district_year",
              "lag.F1logngaid_pc")) {
  orig_var <- sub("lag.", "", var)
  diff_pdat[,var] <- lag(pre.red.pdf[,orig_var])
}

# --------------------------------------------
# Create a data frame with municipalities that placed first and second in the previous election:

slip_dat <- subset(diff_pdat, diff_pdat$lag.abs.rsumLDP_VshareVP<=2)
nrow(slip_dat)
# 921

table(slip_dat$lag.abs.rsumLDP_VshareVP)
# 462 placed first and 459 placed second in the last election

# How many observations remain in top two in current election?
check <- subset(slip_dat, slip_dat$curr.abs.rsumLDP_VshareVP <= 2)
nrow(check)
# 473 ** typo in the book, should be 473, not 173.
nrow(check)/nrow(slip_dat)
# 51.3 of municipalities.

# Create a new variable that indicates how far a municipality that ranked
# first or second in the previous election slipped between elections (positive scores
# mean it increased rank, 0 means it stayed the same, negative scores mean it slipped
# in rank):

slip_dat$slippage <- slip_dat$lag.abs.rsumLDP_VshareVP - slip_dat$curr.abs.rsumLDP_VshareVP
mean(na.omit(slip_dat$slippage))
# -4.439779
sum(na.omit(slip_dat$slippage==0))
# 321
check <- subset(slip_dat, slip_dat$slippage == 0)
nrow(check)/nrow(slip_dat)
# 0.3485342

# ----------------------------
# Model 1

slip1 <- lm(F1logngaid_pc ~ rsumLDP_VshareVP + curr.logngaid_pc + as.factor(curr.year),
            data=slip_dat)
summary(slip1)

# ----------------------------
# Model 2

slip3 <- lm(F1logngaid_pc ~ rsumLDP_VshareVP + curr.logngaid_pc + 
              curr.mun_ceif + curr.needy_pc + curr.primary_pc + curr.lnpop + curr.logincome_pc + curr.mun_population_density + 
              as.factor(curr.year),
            data=slip_dat)
summary(slip3)

stargazer(slip1, slip3,
          omit="as.factor",
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))













# ---------------------------------------------
# TABLE 6.9 

# NOTE:

# Unfortunately, Table 6.9 presents the three models on a different sample than the
# one described in the text.

# In the text (p. 204-206), it says that Models 1 and 2 are conducted on all tournament-possible
# districts in 2009 and Model 3 is conducted on all tournament-possible districts in 2009
# without LDP winners.

# Upon preparing the replication files, I realized that all three models had been conducted on
# tournament-possible districts in 2009 that did not see an LDP candidate lose and be resurrected 
# via PR.  Thus, the sample includes districts with DPJ winners, districts with LDP winners, and
# districts with LDP losers, except those resurrected in PR.

# Below, let me present a replication of Table 6.9 as it is in the book, followed by a replication
# of Table 6.9 as it should have appeared in the book.



# ---------------------------------------
# REPLICATION OF TABLE 6.9 AS IT IS IN THE BOOK:

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

pre.red <- subset(elec.dat, elec.dat$year == 2009 & is.na(elec.dat$muncode_num) == F & 
                    elec.dat$resurrectSSD==0 & elec.dat$tmt_possible==1)

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "hor_electoral_district"))

length(unique(pre.red.pdf$district_year))

# ---------------------------------
# Model 1

mod1 <- stata_plm(paste("F1logngaid_pc ~ sumDPJ_vsharevp + 
                        logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod1)

# ---------------------------------
# Model 2

mod2 <- stata_plm(paste("F1logngaid_pc ~ sumLDP_VshareVP + 
                        logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod2)


# ---------------------------------
# Model 3

mod3 <- stata_plm(paste("F1logngaid_pc ~ sumloseLDP_VshareVP + 
                        logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod3)


# --------------------------------
# print table
stargazer(mod1, mod2, mod3,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))




# ---------------------------------------
# TABLE 6.9 AS IT SHOULD HAVE BEEN IN THE BOOK:

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

pre.red <- subset(elec.dat, elec.dat$year == 2009 & is.na(elec.dat$muncode_num) == F & 
                    elec.dat$tmt_possible==1)

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "hor_electoral_district"))

length(unique(pre.red.pdf$district_year))

# ---------------------------------
# Model 1

mod1 <- stata_plm(paste("F1logngaid_pc ~ sumDPJ_vsharevp + 
                        logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod1)

# ---------------------------------
# Model 2

mod2 <- stata_plm(paste("F1logngaid_pc ~ sumLDP_VshareVP + 
                        logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod2)

# ---------------------------------
# Model 3

pre.red <- subset(elec.dat, elec.dat$year == 2009 & is.na(elec.dat$muncode_num) == F & 
                    elec.dat$cand_01_pty!="LDP" & elec.dat$tmt_possible==1)
pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "hor_electoral_district"))

length(unique(pre.red.pdf$district_year))

mod3 <- stata_plm(paste("F1logngaid_pc ~ sumloseLDP_VshareVP + 
                        logngaid_pc + ",
                        controls), pre.red.pdf,
                  model = "within", effect = "time",
                  cluster = "muncode_num")
summary(mod3)

# --------------------------------
# print table
stargazer(mod1, mod2, mod3,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))

























# ---------------------------------------------
# TABLE A.4 
#
# \ref{convextable}
#

library(margins)
library(sandwich)
library(car)

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

pre.red <- subset(elec.dat, elec.dat$year!= 2009 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)
pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))
pre.red.pdf$Ftransfer <- 100 * pre.red.pdf$F1ngaid_pc


# ------------------------------
# Model 1

cubic.rbest <- stata_plm(paste("Ftransfer ~ rbestLDP_VshareVP +
                         I(rbestLDP_VshareVP^2) + 
                         I(rbestLDP_VshareVP^3) + ",
                               controls), data = pre.red.pdf,
                         model = "within", effect = "time",
                         cluster = "muncode_num")
summary(cubic.rbest)

linearHypothesis(cubic.rbest, c("rbestLDP_VshareVP", "I(rbestLDP_VshareVP^2)", "I(rbestLDP_VshareVP^3)"))

# ------------------------------
# Model 2

cubic.rsum <- stata_plm(paste("Ftransfer ~ rsumLDP_VshareVP +
                         I(rsumLDP_VshareVP^2) + 
                         I(rsumLDP_VshareVP^3) + ",
                              controls), data = pre.red.pdf,
                        model = "within", effect = "time",
                        cluster = "muncode_num")
summary(cubic.rsum)

linearHypothesis(cubic.rsum, c("rsumLDP_VshareVP", "I(rsumLDP_VshareVP^2)", "I(rsumLDP_VshareVP^3)"))



# -------------------------
# print

stargazer(cubic.rbest, cubic.rsum,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))





# --------------------------------------
# Substantive effects from model cubic.rbest:

coef(cubic.rbest)

cubic_effect <- function(model, indep_rows = c(1,2,3), interval = 0.05, start = NULL, spec_points = NULL){
  ### Arguments
  ## "model" requires an lm or plm object
  ## "indep_rows" asks the indices of coefficients for the linear, square, and cubic independent variables
  ## "interval" specifies an interval for the grid of an independent variable
  ## "start" specifies the starting point of an interval for the grid of an independent variable.
  ##          If you leave this blank, it will be set at the minimum value
  ## "spec_points" should be a numeric vector of length 2. 
  # If you do not want a grid, but just the difference between
  ## two points, e.g. between VShare 0.9 and 0.95, you can put these two values.
  
  ##  Extract coefficients for the cubic variable
  coefs <- coef(model)[indep_rows]
  
  ## If spec_points is not specified, estimation is done for a grid
  if(is.null(spec_points)){
    ## If start is not specified, the starting point is the minimum
    if(is.null(start)){
      ## Range of the independent variable
      grid_range = range(model$model[,indep_rows[1] + 1])
    } else{
      ## If start is given, the range will be from "start" to the maximum
      grid_range = c(start, max(model$model[, indep_rows[1] + 1]))
    }
    ## grid is made according to the range with the interval specified by "interval"
    grid <- seq(grid_range[1], grid_range[2], by = interval)
  } else{
    grid <- spec_points
  }
  ## Squared and cubic grids are added
  grid_cubic <- rbind(t(grid), t(grid^2))
  grid_cubic <- rbind(grid_cubic, t(grid^3))
  
  ## Effects from these three variables are aggregated
  linear_pred <- t(grid_cubic) %*% coefs
  ## Row names are the grid points
  row.names(linear_pred) <- grid_cubic[1,]
  return(linear_pred)
}

# Displaying differences on a grid, which goes from 0 to 1 in 0.05 intervals:
grid_pred <- cubic_effect(cubic.rbest)
grid_pred

# Displaying difference going from 0.95 and 1:
two_points_0.9 <- cubic_effect(cubic.rbest, spec_points = c(0.95, 1))
(two_points_0.9[2] - two_points_0.9[1])
# 0.3205968

# Multiply this by units of dependent var, which are in 10,000:
increase_in_money <- (two_points_0.9[2] - two_points_0.9[1])*10000
increase_in_money
# 3205.968

# Mean of Ftransfer?
newdat <- cubic.rbest$model
mean(na.omit(newdat$Ftransfer))*10000
# 40884.99

# as percentage
(((two_points_0.9[2] - two_points_0.9[1])*10000)/(mean(na.omit(newdat$Ftransfer))*10000))*100

# how many people in average muni?
avg.pop <- exp(mean(newdat$lnpop))

# how much extra money is this?
increase_in_money*avg.pop
# 43070932

# Displaying difference going from 0.80 and 0.85:
two_points_0.8 <- cubic_effect(cubic.rbest, spec_points = c(0.80, 0.85))
(two_points_0.8[2] - two_points_0.8[1])
# 0.1969691

# Multiply this by units of the dependent var, which are in 10,000:
increase_in_money <- (two_points_0.8[2] - two_points_0.8[1])*10000
increase_in_money
# 1969.691

# Displaying difference going from 0.6 and 0.55:
two_points_0.5 <- cubic_effect(cubic.rbest, spec_points = c(0.55, 0.6))
(two_points_0.5[2] - two_points_0.5[1])

# Multiply this by units of the dependent var, which are in 10,000:
increase_in_money <- (two_points_0.5[2] - two_points_0.5[1])*10000
increase_in_money
# 381.7568









# -----------------------------------------
# FIGURE 6.1 
#
# \red{convexfigfull}

pre.red <- subset(elec.dat, elec.dat$year!= 2009 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)
pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))
pre.red.pdf$Ftransfer <- 100 * pre.red.pdf$F1ngaid_pc

cubic.rbest.lm <- lm(paste("Ftransfer ~ -1 + rbestLDP_VshareVP +
                         I(rbestLDP_VshareVP^2) + I(rbestLDP_VshareVP^3) +",
                           controls, "+ district_year"), data = pre.red.pdf)
summary(cubic.rbest.lm)

## A function to calculate VCOV in the same way as Stata.
cl_stata <- function(fm, cluster, dfadj, vcov=FALSE) {
  library(sandwich)
  library(lmtest)
  library(zoo)
  ## number of clusters
  M <- length(unique(cluster))
  ## num of obs
  N <- length(cluster)
  ## Stata style correction.
  ## dfadj is used to discount the loss of DF with respect to LEyear_district.
  dfc <- (M/(M-1))*((N-1)/(N-fm$rank + dfadj))
  ## calculation of VCOV
  u <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum))
  vcovCL <-dfc * sandwich (fm, meat = crossprod(u)/N)
  if(vcov==TRUE){
    return(vcovCL)
  } else{
    coeftest(fm, vcovCL)
  }
}

mod <- cubic.rbest.lm$model

## clustered VCOV
clvcov <- cl_stata(cubic.rbest.lm, mod$district_year, dfadj=length(unique(mod$district_year))+1, vcov = T)

## A function to calculate modes.
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

## Calculate the marginal prediction at means; all covariates except for district_year 
# are fixed at their means.  For district_year, use mode

atmean <- c()
for (i in 1:(ncol(mod)-1)) {
  atmean <- c(atmean, mean(as.numeric(mod[,i])))
  print(i)
}

## For district_year, mode is used.
atmean <- c(atmean, as.character(getmode(mod$district_year)))
names(atmean) <- colnames(mod)
atmean

preddat <- data.frame()
for (i in 1:101) {
  this <- as.numeric(atmean[-c(1,3,4, 11)])
  this <- c(this, atmean[11])
  ## rbestLDP_VshareVP is the variable at which the marginal prediction is calculated.
  ## Set the values from 0 to 1 by 0.01.
  this[1] <- (i-1)/100
  names(this) <- c("rbestLDP_VshareVP", 
                   "mun_ceif", "needy_pc", "primary_pc", "lnpop", "logincome_pc", "mun_population_density", 
                   "district_year")
  preddat <- rbind(preddat, as.data.frame(t(this)))
}

for (j in 1:(ncol(preddat)-1)) {
  preddat[,j] <- as.numeric(as.character(preddat[,j]))
}

## car::Predict allows us to calculate confidence intervals with any given VCOV.
pred_margin <- Predict(cubic.rbest.lm, newdata = preddat, interval = "confidence", vcov. = clvcov)
matplot(pred_margin, type = "l", col = c(1,1,1), xaxt = "n", cex.lab = 1.5, xlab = "Rank (Best LDP VS)", 
        ylab = "Post-Election Per Capita Transfers")
axis(1, at= c(0,20,40,60,80,100), labels = c(0, .2, .4, .6, .8, 1))












# ---------------------------------------------
# TABLE A.5
#
# \ref{convextable2}

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

pre.red <- subset(elec.dat, elec.dat$year <= 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)
pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))
pre.red.pdf$Ftransfer <- 100 * pre.red.pdf$F1ngaid_pc

# ------------------------------
# Model 1

cubic.rbest <- stata_plm(paste("Ftransfer ~ rbestLDP_VshareVP +
                         I(rbestLDP_VshareVP^2) + 
                         I(rbestLDP_VshareVP^3) + ",
                               controls), data = pre.red.pdf,
                         model = "within", effect = "time",
                         cluster = "muncode_num")
summary(cubic.rbest)

linearHypothesis(cubic.rbest, c("rbestLDP_VshareVP", "I(rbestLDP_VshareVP^2)", "I(rbestLDP_VshareVP^3)"))

# ------------------------------
# Model 2

cubic.rsum <- stata_plm(paste("Ftransfer ~ rsumLDP_VshareVP +
                         I(rsumLDP_VshareVP^2) + 
                         I(rsumLDP_VshareVP^3) + ",
                              controls), data = pre.red.pdf,
                        model = "within", effect = "time",
                        cluster = "muncode_num")

summary(cubic.rsum)

linearHypothesis(cubic.rsum, c("rsumLDP_VshareVP", "I(rsumLDP_VshareVP^2)", "I(rsumLDP_VshareVP^3)"))

# -------------------------
# print table

stargazer(cubic.rbest, cubic.rsum,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))







# -----------------------------------------
# FIGURE 6.2 
#
# \red{convexfigpre}

pre.red <- subset(elec.dat, elec.dat$year <= 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)
pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))
pre.red.pdf$Ftransfer <- 100 * pre.red.pdf$F1ngaid_pc


cubic.rbest.lm <- lm(paste("Ftransfer ~ -1 + rbestLDP_VshareVP +
                         I(rbestLDP_VshareVP^2) + I(rbestLDP_VshareVP^3) +",
                           controls, "+ district_year"), data = pre.red.pdf)
summary(cubic.rbest.lm)

## A function to calculate VCOV in the same way as Stata.
cl_stata <- function(fm, cluster, dfadj, vcov=FALSE) {
  library(sandwich)
  library(lmtest)
  library(zoo)
  ## number of clusters
  M <- length(unique(cluster))
  ## num of obs
  N <- length(cluster)
  ## Stata style correction.
  ## dfadj is used to discount the loss of DF with respect to LEyear_district.
  dfc <- (M/(M-1))*((N-1)/(N-fm$rank + dfadj))
  ## calculation of VCOV
  u <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum))
  vcovCL <-dfc * sandwich (fm, meat = crossprod(u)/N)
  if(vcov==TRUE){
    return(vcovCL)
  } else{
    coeftest(fm, vcovCL)
  }
}
mod <- cubic.rbest.lm$model

## clustered VCOV
clvcov <- cl_stata(cubic.rbest.lm, mod$district_year, dfadj=length(unique(mod$district_year))+1, vcov = T)

## A function to calculate modes.
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

atmean <- c()
for (i in 1:(ncol(mod)-1)) {
  atmean <- c(atmean, mean(as.numeric(mod[,i])))
  print(i)
}

## For district_year, mode is used.
atmean <- c(atmean, as.character(getmode(mod$district_year)))
names(atmean) <- colnames(mod)
atmean
preddat <- data.frame()
for (i in 1:101) {
  this <- as.numeric(atmean[-c(1,3,4, 11)])
  this <- c(this, atmean[11])
  ## rbestLDP_VshareVP is the variable at which the marginal prediction is calculated.
  ## I set the values from 0 to 1 by 0.01.
  this[1] <- (i-1)/100
  names(this) <- c("rbestLDP_VshareVP", 
                   "mun_ceif", "needy_pc", "primary_pc", "lnpop", "logincome_pc", "mun_population_density", 
                   "district_year")
  preddat <- rbind(preddat, as.data.frame(t(this)))
}

for (j in 1:(ncol(preddat)-1)) {
  preddat[,j] <- as.numeric(as.character(preddat[,j]))
}

pred_margin <- Predict(cubic.rbest.lm, newdata = preddat, interval = "confidence", vcov. = clvcov)
matplot(pred_margin, type = "l", col = c(1,1,1), xaxt = "n", cex.lab = 1.5, xlab = "Rank (Best LDP VS)", 
        ylab = "Post-Election Per Capita Transfers")
axis(1, at= c(0,20,40,60,80,100), labels = c(0, .2, .4, .6, .8, 1))








# ---------------------------------------------
# TABLE A.6
# 
# {convextable3}

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

pre.red0 <- subset(elec.dat, elec.dat$year > 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)
pre.red <- subset(pre.red0, pre.red0$year!=2009)

pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))

pre.red.pdf$Ftransfer <- 100 * pre.red.pdf$F1ngaid_pc

# ------------------------------
# Model 1

cubic.rsum <- stata_plm(paste("Ftransfer ~ rsumLDP_VshareVP +
                         I(rsumLDP_VshareVP^2) + 
                         I(rsumLDP_VshareVP^3) + ",
                              controls), data = pre.red.pdf,
                        model = "within", effect = "time",
                        cluster = "muncode_num")

linearHypothesis(cubic.rsum, c("rsumLDP_VshareVP", "I(rsumLDP_VshareVP^2)", "I(rsumLDP_VshareVP^3)"))

# -------------------------
# print

stargazer(cubic.rsum,
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))







# -----------------------------------------
# FIGURE 6.3 
#
# \red{convexfigpost}

pre.red0 <- subset(elec.dat, elec.dat$year > 1993 & is.na(elec.dat$muncode_num) == F & elec.dat$tmt_possible==1)
pre.red <- subset(pre.red0, pre.red0$year!=2009)
pre.red.pdf <- pdata.frame(pre.red, index = c("muncode_num", "district_year"))
pre.red.pdf$Ftransfer <- 100 * pre.red.pdf$F1ngaid_pc

cubic.rsum.lm <- lm(paste("Ftransfer ~ -1 + rsumLDP_VshareVP +
                         I(rsumLDP_VshareVP^2) + I(rsumLDP_VshareVP^3) +",
                           controls, "+ district_year"), data = pre.red.pdf)
summary(cubic.rsum.lm)

## A function to calculate VCOV in the same way as Stata.
cl_stata <- function(fm, cluster, dfadj, vcov=FALSE) {
  library(sandwich)
  library(lmtest)
  library(zoo)
  ## number of clusters
  M <- length(unique(cluster))
  ## num of obs
  N <- length(cluster)
  ## Stata style correction.
  ## dfadj is used to discount the loss of DF with respect to LEyear_district.
  dfc <- (M/(M-1))*((N-1)/(N-fm$rank + dfadj))
  ## calculation of VCOV
  u <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum))
  vcovCL <-dfc * sandwich (fm, meat = crossprod(u)/N)
  if(vcov==TRUE){
    return(vcovCL)
  } else{
    coeftest(fm, vcovCL)
  }
}
mod <- cubic.rsum.lm$model

## clustered VCOV
clvcov <- cl_stata(cubic.rsum.lm, mod$district_year, dfadj=length(unique(mod$district_year))+1, vcov = T)

## A function to calculate modes.
getmode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

atmean <- c()
for (i in 1:(ncol(mod)-1)) {
  atmean <- c(atmean, mean(as.numeric(mod[,i])))
  print(i)
}

## For district_year, mode is used.
atmean <- c(atmean, as.character(getmode(mod$district_year)))
names(atmean) <- colnames(mod)
atmean
preddat <- data.frame()
for (i in 1:101) {
  this <- as.numeric(atmean[-c(1,3,4, 11)])
  this <- c(this, atmean[11])
  this[1] <- (i-1)/100
  names(this) <- c("rsumLDP_VshareVP", 
                   "mun_ceif", "needy_pc", "primary_pc", "lnpop", "logincome_pc", "mun_population_density", 
                   "district_year")
  preddat <- rbind(preddat, as.data.frame(t(this)))
}

for (j in 1:(ncol(preddat)-1)) {
  preddat[,j] <- as.numeric(as.character(preddat[,j]))
}

pred_margin <- Predict(cubic.rsum.lm, newdata = preddat, interval = "confidence", vcov. = clvcov)
matplot(pred_margin, type = "l", col = c(1,1,1), xaxt = "n", cex.lab = 1.5, xlab = "Rank (Sum LDP VS)", 
        ylab = "Post-Election Per Capita Transfers")
axis(1, at= c(0,20,40,60,80,100), labels = c(0, .2, .4, .6, .8, 1))









# ----------------------------------------------------
# FIGURE 6.4
#
# Campaign manifesto analyses



#---------------------------------
# Read in file I created for my 2016 book, which is the JHRED data (version 2015)
# matched to the file names of candidate manifestos in my collection.

library(readr)
rs <- read_csv("RSC_JHRED_12-23-2016.csv",
               locale = locale(encoding = "SHIFT-JIS"))

rs2 <- subset(rs, is.na(rs$file) == F)
nrow(rs2)
# 10171
table(rs2$year)





#---------------------------------
# Read in topics data from my 2016 book, and use file name
# variable to attach topics to the above data.

topics <- read_csv("Catalinac_merged_with_topics.csv",
                   locale = locale(encoding = "SHIFT-JIS"))

# Prune to data with file populated
topics2 <- subset(topics, is.na(topics$file)==F)
dim(topics2)
# all in there

# Make sure all files in topics are in rs2.

n <- topics2$file %in% rs2$file
table(n) #1 is not.

# due to a Japanese character mismatch
check <- subset(topics2, !(topics2$file %in% rs2$file))
check$file
# X2000.148.���䌧��Q��Ϸ����؃}�L�m���.txti
# fix in rs2 it is "X2000.148.���䌧��Q��Ϸ����؃}�L�m����.txt"
# fix this

# Let us edit topics2 so that file name matches the one in rs2
topics2$filenew <- ifelse(topics2$file=="X2000.148.���䌧��Q��Ϸ����؃}�L�m���.txt",
                          paste("X2000.148.���䌧��Q��Ϸ����؃}�L�m����.txt"), topics2$file
)
n <- topics2$filenew %in% rs2$file
table(n)
# yes

topics2$filenew
topics2$filenew <- NULL

# ----------
# Prune topics2 down to needed vars:
  
names(topics2)

topics3 <- topics2[, -c(1, 2, 4, 5, 6)]

# attach topics3 to rs2

mer <- merge(rs2, topics3, by="file", all=F)
names(mer)

table(mer$year)

# matches table(topics2$year)
# table(topics2$year)






#---------------------------------
# Read in no_t_all coding, and attach onto mer.

# Adjust year==1990 to year==1989 in mer:

mer$year_adj <- ifelse(mer$year=="1990", paste("1989"), mer$year)
table(mer$year_adj)

# Make year_district variable for mer

# sum(is.na(mer$kucode))
# sum(is.na(mer$year))

mer$district_year <- paste(mer$year_adj, mer$kucode, sep="_")

# Read in no tournaments district:
notourn <- read.csv(file = "no_tmts_districts2.csv")

mer$tmt_possible <- ifelse(mer$district_year %in% notourn$no_t_all, 0, 1) 
table(mer$tmt_possible)








# --------------------------------------------------
# Now, mer contains my topics data, merged with JHRED data.  Candidates for which 
# I do not have topics data are out. nrow is 7497. "tmt_possible" connotes districts where 
# tournaments are possible:

# differences in means:

# In 1986, do LDP candidates candidates in tmt_possible districts discuss more pork?
dat1986ldp <- subset(mer, mer$year==1986 & mer$party_en=="LDP")
mean(dat1986ldp$priv)

dat1986a <- subset(mer, mer$year==1986 & (mer$party_en=="LDP") & mer$tmt_possible==0)
mean(dat1986a$pub)
mean(dat1986a$priv)

dat1986b <- subset(mer, mer$year==1986 & (mer$party_en=="LDP") & mer$tmt_possible==1)
mean(dat1986b$pub)
mean(dat1986b$priv)

t.test(dat1986a$priv, dat1986b$priv)
t.test(dat1986a$pub, dat1986b$pub)
# yes

# In 1990, do LDP candidates in tmt_possible districts discuss more pork?

dat1990ldp <- subset(mer, mer$year==1990 & mer$party_en=="LDP")
mean(dat1990ldp$priv)

dat1990a <- subset(mer, mer$year==1990 & (mer$party_en=="LDP") & mer$tmt_possible==0)
mean(dat1990a$pub)
mean(dat1990a$priv)

dat1990b <- subset(mer, mer$year==1990 & (mer$party_en=="LDP") & mer$tmt_possible==1)
mean(dat1990b$pub)
mean(dat1990b$priv)

t.test(dat1990a$priv, dat1990b$priv)
t.test(dat1990a$pub, dat1990b$pub)
# yes, different

# In 1993, do LDP candidates in tmt_possible districts discuss more pork?

dat1993ldp <- subset(mer, mer$year==1993 & mer$party_en=="LDP")
mean(dat1993ldp$priv)

dat1993a <- subset(mer, mer$year==1993 & (mer$party_en=="LDP") & mer$tmt_possible==0)
mean(dat1993a$pub)
mean(dat1993a$priv)

dat1993b <- subset(mer, mer$year==1993 & (mer$party_en=="LDP") & mer$tmt_possible==1)
mean(dat1993b$pub)
mean(dat1993b$priv)

t.test(dat1993a$priv, dat1993b$priv)
t.test(dat1993a$pub, dat1993b$pub)
# yes, different

# In 1996, do LDP candidates in tmt_possible districts discuss more pork?

dat1996ldp <- subset(mer, mer$year==1996 & mer$party_en=="LDP")
mean(dat1996ldp$priv)

dat1996a <- subset(mer, mer$year==1996 & (mer$party_en=="LDP") & mer$tmt_possible==0)
mean(dat1996a$pub)
mean(dat1996a$priv)

dat1996b <- subset(mer, mer$year==1996 & (mer$party_en=="LDP") & mer$tmt_possible==1)
mean(dat1996b$pub)
mean(dat1996b$priv)

t.test(dat1996a$priv, dat1996b$priv)
t.test(dat1996a$pub, dat1996b$pub)
# yes, different

# In 2000, do LDP candidates in tmt_possible districts discuss more pork?
dat2000ldp <- subset(mer, mer$year==2000 & mer$party_en=="LDP")
mean(dat2000ldp$priv)

dat2000a <- subset(mer, mer$year==2000 & (mer$party_en=="LDP") & mer$tmt_possible==0)
mean(dat2000a$pub)
mean(dat2000a$priv)

dat2000b <- subset(mer, mer$year==2000 & (mer$party_en=="LDP") & mer$tmt_possible==1)
mean(dat2000b$pub)
mean(dat2000b$priv)

t.test(dat2000a$priv, dat2000b$priv)
t.test(dat2000a$pub, dat2000b$pub)
# yes, different

# In 2003, do LDPI candidates in tmt_possible districts discuss more pork?

dat2003ldp <- subset(mer, mer$year==2003 & mer$party_en=="LDP")
mean(dat2003ldp$priv)

dat2003a <- subset(mer, mer$year==2003 & (mer$party_en=="LDP") & mer$tmt_possible==0)
mean(dat2003a$pub)
mean(dat2003a$priv)

dat2003b <- subset(mer, mer$year==2003 & (mer$party_en=="LDP") & mer$tmt_possible==1)
mean(dat2003b$pub)
mean(dat2003b$priv)

t.test(dat2003a$priv, dat2003b$priv)
t.test(dat2003a$pub, dat2003b$pub)
# yes, different

# In 2005, do LDPI candidates in tmt_possible districts discuss more pork?

dat2005ldp <- subset(mer, mer$year==2005 & mer$party_en=="LDP")
mean(dat2005ldp$priv)

dat2005a <- subset(mer, mer$year==2005 & (mer$party_en=="LDP") & mer$tmt_possible==0)
mean(dat2005a$pub)
mean(dat2005a$priv)

dat2005b <- subset(mer, mer$year==2005 & (mer$party_en=="LDP") & mer$tmt_possible==1)
mean(dat2005b$pub)
mean(dat2005b$priv)

t.test(dat2005a$priv, dat2005b$priv)
t.test(dat2005a$pub, dat2005b$pub)
# yes, different

# In 2009, do LDPI candidates in tmt_possible districts discuss more pork?

dat2009ldp <- subset(mer, mer$year==2009 & mer$party_en=="LDP")
mean(dat2009ldp$priv)

dat2009a <- subset(mer, mer$year==2009 & (mer$party_en=="LDP") & mer$tmt_possible==0)
mean(dat2009a$pub)
mean(dat2009a$priv)

dat2009b <- subset(mer, mer$year==2009 & (mer$party_en=="LDP") & mer$tmt_possible==1)
mean(dat2009b$pub)
mean(dat2009b$priv)

t.test(dat2009a$priv, dat2009b$priv)
t.test(dat2009a$pub, dat2009b$pub)
# yes, different






# ----------------------------------------------
# Drawing the figure 
#

# Make a vector of years
years <- c(2009, 2005, 2003, 2000, 1996, 1993, 1990, 1986)

# Discussion of pork:

# All LDP candidates:
all.priv <- c(mean(dat2009ldp$priv), mean(dat2005ldp$priv), mean(dat2003ldp$priv), 
              mean(dat2000ldp$priv), mean(dat1996ldp$priv), 
              mean(dat1993ldp$priv), mean(dat1990ldp$priv), mean(dat1986ldp$priv))

# LDP candidates in non-tournament districts:
not_possible.priv <- c(mean(dat2009a$priv), mean(dat2005a$priv), mean(dat2003a$priv), mean(dat2000a$priv),
                       mean(dat1996a$priv), mean(dat1993a$priv), mean(dat1990a$priv), mean(dat1986a$priv))

# LDP candidates in tournamen-possible districts:
possible.priv <- c(mean(dat2009b$priv), mean(dat2005b$priv), mean(dat2003b$priv), mean(dat2000b$priv),
                   mean(dat1996b$priv), mean(dat1993b$priv), mean(dat1990b$priv), mean(dat1986b$priv))

par(mfrow = c(1, 1), mar = c(4,4,2,1), tcl = -0.25, mgp = c(1.75, 0.6, 0),
    font.main = 1, cex.main = 2)

plot(years, not_possible.priv, cex.lab = 2.5, pch=19, col="black", ylim=c(0,0.8), 
     ylab="Proportion of Discussion of Pork-barrelling", xlab="", xaxt="n", cex.lab=2)

axis(1, at=c(1986, 1990, 1993, 1996, 2000, 2003, 2005, 2009), cex.axis=1.5)
lines(years, not_possible.priv, col="black", lwd =3)
text(x=1995, y=0.23, labels="Tournament-Impossible", cex=2, pos=2, col="black")
text(x=1992, y=0.17, labels="Districts", cex=2, xpd=NA, pos=2, col="black")

points(years, possible.priv, pch=19)
lines(years, possible.priv, lwd=2, type="c")
text(x=2000.5, y=0.65, labels="Tournament-Possible", cex=2, xpd=NA, pos=2)
text(x=1998, y = 0.6, labels="Districts", cex=2, xpd=NA, pos=2)

arrows(1990, 0.27, 1991, 0.35, lwd=3, col = "black")
arrows(1996, 0.57, 1995.5, 0.5, lwd=3, col = "black")












# -----------------------------------
#
# TABLE 6.10
# \{eval}
# -----------------------------------

# Read in a file containing survey data from UTAS survey in 2012.
# Code book is 2012_13UTASV_English_20210517.doc
# Note that I made kucode myself by concatenating PREFEC and HRDIST
# in the Excel file (kucode is label it is given in JHRED).

#library(readr)
rs <- read_csv("2012_voter_survey_data_edits.csv",
               locale = locale(encoding = "SHIFT-JIS"))

length(unique(rs$kucode))
# 241

rs$district_year <- paste("2012", rs$kucode, sep="_")

# Read in no tournaments district:
notourn <- read.csv(file = "no_tmts_districts2.csv")
rs$tmt_possible <- ifelse(rs$district_year %in% notourn$no_t_all, 0, 1) 

# How many responses?
table(rs$tmt_possible)
# 992 observations are in tournament-impossible districts
# 908 observations are in tournament-possible districts

rs.poss <- subset(rs, rs$tmt_possible==1)
length(unique(rs.poss$district_year))
# 112

rs.imposs <- subset(rs, rs$tmt_possible==0)
length(unique(rs.imposs$district_year))
# 129






# -----------------------------------------
# Creating control variables

# Create gender variable

# Q41: Are you a man or a woman? (Q014100). 

# In data file, Q014100 is 1 for a man, 2 for woman.
class(rs$Q014100)
rs$Q014100 <- as.factor(rs$Q014100)
class(rs$Q014100)
rs$gender <- rs$Q014100
table(rs$gender)

# NOTE: 

# In book text, gender variable is described as being coded 0 for a man and 1 for a woman.

# Q42: How old are you as of December 16, 2012? (Q014200)
# Coded in groups: 1 is 20s, 2 is 30s .... 6 is 70's or older.
# Larger numbers are older; keep as is.
rs$age <- rs$Q014200
table(rs$age)









# ------------------------------------
# MODEL 1

# Dependent variable:
# Q4: For which party's candidate did you vote in the single-member district (SMD)? 
# (Q010400)

table(rs$Q010400)
# 10 respondents are coded "99", which is no answer.
# NB: Code book says they are coded "999", but in data file, they are "99".

# Get rid of respondents not giving an answer:
rs.new <- subset(rs, !(rs$Q010400==99))
table(rs.new$Q010400)

# Code the dependent variable: respondents who voted for the LDP in the SSD:
rs.new$voted_for_ldp <- ifelse(rs.new$Q010400==2, 1, 0)
class(rs.new$voted_for_ldp)

# Here, independent variable of interest is tmt_possible

# Run model
mod1 <- lm(voted_for_ldp ~ tmt_possible + age + gender, data =rs.new)
summary(mod1)










# ------------------------------------
# MODEL 2
#

# Use same dependent variable as above (rs.new$voted_for_ldp)

# Create independent variable of interest (likes LDP policies)
# Q9: "how do you evaluate the pledges of the Liberal Democratic Party
# on the policies you answered "most important" in Q6?" 
# Respondents can answer positively, not sure, or negatively.

table(rs.new$Q010900)

# Get rid of 99 (no answer) and 66 (did not vote)
rs.new2 <- subset(rs.new, !(rs.new$Q010900==99|rs.new$Q010900==66))
table(rs.new2$Q010900)

# Make a dummy variable with 1 if respondent answers positively or somewhat positively 
# to this question:
rs.new2$LDP_posit <- ifelse((rs.new2$Q010900==1|rs.new2$Q010900==2), 1, 0)
table(rs.new2$LDP_posit)

# Run model:
mod2 <- lm(voted_for_ldp ~ LDP_posit + age + gender, data = rs.new2)
summary(mod2)







# ------------------------------------
# MODEL 3
#

mod3 <- lm(voted_for_ldp ~ LDP_posit*tmt_possible + age + gender, data = rs.new2)
summary(mod3)



# ------------------------------------
# PRINT TABLE

stargazer(mod1, mod2, mod3,
          no.space = T,
          omit.stat = c("f"),
          star.cutoffs = c(0.05, 0.01, 0.001)
          #out = "models1.txt"
)









# -----------------------------------
#
# TABLE 6.11
# \ref{eval2}
# -----------------------------------

# Use same sample but look only at people who reported voting for the LDP in SSD

rs2 <- subset(rs.new2, rs.new2$voted_for_ldp==1)
nrow(rs2)

mod4 <- lm(LDP_posit ~ tmt_possible, data = rs2)
summary(mod4)

mod5 <- lm(LDP_posit ~ tmt_possible + age + gender, data = rs2)
summary(mod5)

# ---------------------
# Print table
stargazer(mod4, mod5, 
          no.space = T,
          omit.stat = c("f"),
          star.cutoffs = c(0.05, 0.01, 0.001)
          #out = "models1.txt"
)







# ------------------------------------------
# TABLE A.7
#  \ref{within_dist_T6}
# ------------------------------------------

controls <- "mun_ceif + needy_pc + primary_pc + lnpop + logincome_pc + mun_population_density"

saito <- subset(elec.dat, elec.dat$year != 2009 & is.na(elec.dat$muncode_num) == F  & elec.dat$tmt_possible==1)

saito.pdf <- pdata.frame(saito, index = c("muncode_num", "year"))

# -------------------------------------------------------------
# No district-year FE

sai.best <- stata_plm(paste("F1logngaid_pc ~ bestLDP_VshareVP +
                      logngaid_pc + ",
                            controls), saito.pdf,
                      model = "within", effect = "time", cluster="muncode_num")

summary(sai.best)

sai.sum <- stata_plm(paste("F1logngaid_pc ~ sumLDP_VshareVP +
                      logngaid_pc + ",
                           controls), saito.pdf,
                     model = "within", effect = "time", cluster="muncode_num")
summary(sai.sum)

# -------------------------------------------------------------
# Alternative measure of support for the LDP
# all votes cast for LDP cands in the municipality/votes cast, with year FE)

saito.pdf$VotesLDP_cast <- saito.pdf$VotesLDP/saito.pdf$mun_voted

sai.spec2 <- stata_plm(paste("F1logngaid_pc ~ VotesLDP_cast +
                      logngaid_pc + ",
                             controls), saito.pdf,
                       model = "within", effect = "time", cluster = "muncode_num")
summary(sai.spec2)

# print table
stargazer(sai.best, sai.sum, sai.spec2, 
          no.space = T,
          omit.stat = c("f", "adj.rsq"),
          star.cutoffs = c(0.05, 0.01, 0.001))







